## "replication_causalforest.R" from replication package for "Registering Returning Citizens to Vote"
## This script reproduces Figure A5 and Table A8 from the SI (Athey/Wager approach to heterogeneity)

rm(list = ls())
library(tidyverse)
library(data.table)
library(mltools)
library(grf)
library(xtable)

###########################################################################
# Prepare data
###########################################################################

main <- read_csv("main_paper_2025.csv")
# extract north carolina data
main <- main[(main$list == "NCmainlist"), ]

# set up indicator for any treatment (ie not control)
main$anytreat <- 1; 
main[main$Treatment == "T1", "anytreat"] <- 0; table(main$anytreat)

#make age & release bins so they're evenly-divided within NC
main$agebin <- as.numeric(cut_number(main$ageyears,5)) #split age out into bins instead of years

main$dayssincerelease <- as.Date("2020-11-03") - main$lastincarcend
main$yearssincerelease <- main$dayssincerelease/365.25
main$releasebin <- as.numeric(cut_number(as.numeric(main$yearssincerelease), 5))
main %>% dplyr::select(pastincarc, lastincarcend, ends_with("sincerelease")) %>% head()

###########################################################################
# fit causal forest 
###########################################################################
variablestouse <- c("male", "wrublack", "wruhispanic", "agebin", "pastsupervision", "releasebin") 

# select relevant columns
# drop observations with NAs in any columns
 main %>% 
   dplyr::select(variablestouse, anytreat, reg2020) %>% 
   {. ->> tmp} %>% 
   drop_na() -> main.na.drop

main <- main.na.drop #in this version, to include releasebin, we're dropping people with no prev incarc. Figure A5 doesn't look that different (for possible-to-include covars) if instead keeping all regardless of incarc status and not using releasebin
 
# formula used for the purpose of creating dummy variables
X0 <- data.frame(lapply(main[, variablestouse], as.factor)) #keep some columns, make factors
X <- model.matrix(~., model.frame(~ ., X0, na.action=na.pass))[, -1]

W <- main$anytreat
Y <- main$reg2020/100

#start using their code! from pg. 21
set.seed(14609)
Y.forest <- regression_forest(X, Y)
Y.hat <- predict(Y.forest)$predictions
W.forest <- regression_forest(X, W)
W.hat <- predict(W.forest)$predictions

Y.forest.varimp <- variable_importance(Y.forest)

# Note from the package website: Forests may have a hard time when trained on very few variables
# (e.g., ncol(X) = 1, 2, or 3). We recommend not being too aggressive
# in selection.
selected.vars <- which(Y.forest.varimp / mean(Y.forest.varimp) > 0.2); selected.vars

# selecting variables: not too aggressive
tau.forest <- causal_forest(X[, selected.vars], Y, W,
  W.hat = W.hat, Y.hat = Y.hat
)

#saveRDS(object = list(tau.forest = tau.forest, Y.forest.varimp = Y.forest.varimp), 
#        file = "tmp/tau_forests_use.rds")

###########################################################################
# explore results
###########################################################################
#tau.forests <- readRDS("tmp/tau_forests_use.rds")
#tau.forest <- tau.forests$tau.forest
#Y.forest.varimp <- tau.forests$Y.forest.varimp

bind_cols(variable = colnames(X), importance = Y.forest.varimp[, 1]) %>% 
  arrange(desc(importance)) %>%
  xtable() %>% 
  print(include.rownames = FALSE, file = "cf_mod1_varimp.tex") 


tibble(x = c("Black", "Non-black", "Male", "Female", 
             "Agebin1", "Agebin2", "Agebin3", "Agebin4", "Agebin5","PastSupervision", "NoPastSupervision", "Releasebin1", "Releasebin2", "Releasebin3", "Releasebin4", "Releasebin5"), 
       y = c(average_treatment_effect(tau.forest, 
                                      subset = X[, "wrublack1"] == 1)[1], 
             average_treatment_effect(tau.forest, 
                                      subset = X[, "wrublack1"] == 0)[1],
             average_treatment_effect(tau.forest, 
                                      subset = X[, "male1"] == 1)[1], 
             average_treatment_effect(tau.forest, 
                                      subset = X[, "male1"] == 0)[1], 
             average_treatment_effect(tau.forest, 
                                      subset = X[, "agebin2"] == 0 & X[, "agebin3"] == 0 & 
                                        X[, "agebin4"] == 0 & X[, "agebin5"] == 0)[1], 
             average_treatment_effect(tau.forest, 
                                      subset = X[, "agebin2"] == 1 & X[, "agebin3"] == 0 & 
                                        X[, "agebin4"] == 0 & X[, "agebin5"] == 0)[1], 
             average_treatment_effect(tau.forest, 
                                      subset = X[, "agebin2"] == 0 & X[, "agebin3"] == 1 & 
                                        X[, "agebin4"] == 0 & X[, "agebin5"] == 0)[1], 
             average_treatment_effect(tau.forest, 
                                      subset = X[, "agebin2"] == 0 & X[, "agebin3"] == 0 & 
                                        X[, "agebin4"] == 1 & X[, "agebin5"] == 0)[1], 
             average_treatment_effect(tau.forest, 
                                      subset = X[, "agebin2"] == 0 & X[, "agebin3"] == 0 & 
                                        X[, "agebin4"] == 0 & X[, "agebin5"] == 1)[1],
             average_treatment_effect(tau.forest, 
                                      subset = X[, "pastsupervision1"] == 1)[1], 
             average_treatment_effect(tau.forest, 
                                      subset = X[, "pastsupervision1"] == 0)[1],
             average_treatment_effect(tau.forest, 
                                      subset = X[, "releasebin2"] == 0 & X[, "releasebin3"] == 0 & 
                                        X[, "releasebin4"] == 0 & X[, "releasebin5"] == 0)[1], 
             average_treatment_effect(tau.forest, 
                                      subset = X[, "releasebin2"] == 1 & X[, "releasebin3"] == 0 & 
                                        X[, "releasebin4"] == 0 & X[, "releasebin5"] == 0)[1], 
             average_treatment_effect(tau.forest, 
                                      subset = X[, "releasebin2"] == 0 & X[, "releasebin3"] == 1 & 
                                        X[, "releasebin4"] == 0 & X[, "releasebin5"] == 0)[1], 
             average_treatment_effect(tau.forest, 
                                      subset = X[, "releasebin2"] == 0 & X[, "releasebin3"] == 0 & 
                                        X[, "releasebin4"] == 1 & X[, "releasebin5"] == 0)[1], 
             average_treatment_effect(tau.forest, 
                                      subset = X[, "releasebin2"] == 0 & X[, "releasebin3"] == 0 & 
                                        X[, "releasebin4"] == 0 & X[, "releasebin5"] == 1)[1]),
       sd = c(average_treatment_effect(tau.forest, 
                                       subset = X[, "wrublack1"] == 1)[2], 
              average_treatment_effect(tau.forest, 
                                       subset = X[, "wrublack1"] == 0)[2],
              average_treatment_effect(tau.forest, 
                                       subset = X[, "male1"] == 1)[2], 
              average_treatment_effect(tau.forest, 
                                       subset = X[, "male1"] == 0)[2], 
              average_treatment_effect(tau.forest, 
                                       subset = X[, "agebin2"] == 0 & X[, "agebin3"] == 0 & 
                                         X[, "agebin4"] == 0 & X[, "agebin5"] == 0)[2], 
              average_treatment_effect(tau.forest, 
                                       subset = X[, "agebin2"] == 1 & X[, "agebin3"] == 0 & 
                                         X[, "agebin4"] == 0 & X[, "agebin5"] == 0)[2], 
              average_treatment_effect(tau.forest, 
                                       subset = X[, "agebin2"] == 0 & X[, "agebin3"] == 1 & 
                                         X[, "agebin4"] == 0 & X[, "agebin5"] == 0)[2], 
              average_treatment_effect(tau.forest, 
                                       subset = X[, "agebin2"] == 0 & X[, "agebin3"] == 0 & 
                                         X[, "agebin4"] == 1 & X[, "agebin5"] == 0)[2], 
              average_treatment_effect(tau.forest, 
                                       subset = X[, "agebin2"] == 0 & X[, "agebin3"] == 0 & 
                                         X[, "agebin4"] == 0 & X[, "agebin5"] == 1)[2],
             average_treatment_effect(tau.forest, 
                                      subset = X[, "pastsupervision1"] == 1)[2], 
             average_treatment_effect(tau.forest, 
                                      subset = X[, "pastsupervision1"] == 0)[2],
              average_treatment_effect(tau.forest, 
                                       subset = X[, "releasebin2"] == 0 & X[, "releasebin3"] == 0 & 
                                         X[, "releasebin4"] == 0 & X[, "releasebin5"] == 0)[2], 
              average_treatment_effect(tau.forest, 
                                       subset = X[, "releasebin2"] == 1 & X[, "releasebin3"] == 0 & 
                                         X[, "releasebin4"] == 0 & X[, "releasebin5"] == 0)[2], 
              average_treatment_effect(tau.forest, 
                                       subset = X[, "releasebin2"] == 0 & X[, "releasebin3"] == 1 & 
                                         X[, "releasebin4"] == 0 & X[, "releasebin5"] == 0)[2], 
              average_treatment_effect(tau.forest, 
                                       subset = X[, "releasebin2"] == 0 & X[, "releasebin3"] == 0 & 
                                         X[, "releasebin4"] == 1 & X[, "releasebin5"] == 0)[2], 
              average_treatment_effect(tau.forest, 
                                       subset = X[, "releasebin2"] == 0 & X[, "releasebin3"] == 0 & 
                                         X[, "releasebin4"] == 0 & X[, "releasebin5"] == 1)[2]

)
) %>% 
  mutate(upper = y + 1.96 * sd, lower = y - 1.96 * sd) %>% 
  mutate(x = factor(x, levels = c("Black", "Non-black", "Male", "Female", 
                                  "Agebin1", "Agebin2", "Agebin3", "Agebin4", "Agebin5","PastSupervision", "NoPastSupervision","Releasebin1", "Releasebin2", "Releasebin3", "Releasebin4", "Releasebin5"))) %>% 
  ggplot() + 
  geom_point(aes(x = x, y = y)) + 
  geom_linerange(aes(x = x, ymin = lower, ymax = upper), lwd = 1) + 
  xlab("Covariate") + 
  ylab("Treatment effect") + 
  theme_bw() -> p

ggsave(p, filename = "cv_mod1_coef.pdf", width = 16, height = 5)

